#################################################################
## Tyler Pratt ##################################################
## "Deference and Hierarchy in International Regime Complexes" ##
## Code to Estimate Topic Model ################################# 
#################################################################

## topic models for CT, EM, IPR policy documents
library(stm)
library(tm)

setwd()
## Import text data from counterterrorism policy documents
CTdata <- read.csv("CTdocs.csv")


processed <- textProcessor(CTdata$documents, metadata=CTdata)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

## Fit the structural topic model
model1 <- selectModel(out$documents, out$vocab, K=10, 
                      prevalence = ~ as.factor(IGO) + as.factor(Year), max.em.its = 500,
                      runs=20, data = out$meta, seed = 51111)

plotModels(model1) # examine models based on exclusivity and semantic coherence
ten <- model1$runout[[2]] # select best model
labelTopics(ten)

## identify substantive topics by examining most frequent words in each topic
# for replication, topics will need to be re-categorized due to label switching
library(wordcloud) 
cloud(ten, topic=1, max.words = 20) 
cloud(ten, topic=2, max.words = 20) 
cloud(ten, topic=3, max.words = 20) 
cloud(ten, topic=4, max.words = 20) 
cloud(ten, topic=5, max.words = 20) 
cloud(ten, topic=6, max.words = 20) 
cloud(ten, topic=7, max.words = 20) 
cloud(ten, topic=8, max.words = 20) 
cloud(ten, topic=9, max.words = 20) 
cloud(ten, topic=10, max.words = 20) 


topics.ten <- cbind(CTdata[ ,1:3], as.data.frame(ten$theta))
colnames(topics.ten)[4:13] <- 1:10

## Check topic distribution for each year
topic.dist <- as.data.frame(matrix(nrow=1, ncol=12))
colnames(topic.dist) <- c("IGO", "year", "one", "two", "three",
                          "four", "five", "six", "seven", "eight",
                          "nine", "ten")
for(i in sort(unique(CTdata$Year))){
  sub <- topics.ten[topics.ten$Year == i,]
  orgs.rel <- unique(sub$IGO)
  sub.rel <- as.data.frame(matrix(nrow = length(orgs.rel), ncol = 12))
  colnames(sub.rel) <- c("IGO", "year", "one", "two", "three",
                         "four", "five", "six", "seven", "eight",
                         "nine", "ten")
  sub.rel[ ,1] <- orgs.rel
  sub.rel[ ,2] <- i
  for(j in 1:length(orgs.rel)){
    info <- sub[sub$IGO == orgs.rel[j], ]
    sub.rel[j, 3:12] <- colSums(info[ ,4:13]) / nrow(info)
  }
  topic.dist <- rbind(topic.dist, sub.rel, deparse.level=1)
  print(i)
}
topic.dist <- topic.dist[-1,]
write.csv(topic.dist, "CT_stm_ten.csv")



## Repeat with text data for IPR
IPRdata <- read.csv("IPRdocs.csv")

processed <- textProcessor(IPRdata$documents, metadata=IPRdata)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

model1 <- selectModel(out$documents, out$vocab, K=10, 
                      prevalence = ~ as.factor(IGO) + as.factor(Year), max.em.its = 500,
                      runs=20, data = out$meta, seed = 51111)
save(model1, file = "IPR_fittedtopic.Rdata")
load("IPR_fittedtopic.Rdata")

plotModels(model1)
ten <- model1$runout[[4]] 
labelTopics(ten)

library(wordcloud)
cloud(ten, topic=1, max.words = 10) 
cloud(ten, topic=2, max.words = 10) 

topics.ten <- cbind(IPRdata[ ,1:3], as.data.frame(ten$theta))
colnames(topics.ten)[4:13] <- 1:10

## Check topic distribution for each year
topic.dist <- as.data.frame(matrix(nrow=1, ncol=12))
colnames(topic.dist) <- c("IGO", "year", "one", "two", "three",
                          "four", "five", "six", "seven", "eight",
                          "nine", "ten")
sort(unique(IPRdata$Year))
for(i in c(1992, 1994:2013)){
  sub <- topics.ten[topics.ten$Year == i,]
  orgs.rel <- unique(sub$IGO)
  sub.rel <- as.data.frame(matrix(nrow = length(orgs.rel), ncol = 12))
  colnames(sub.rel) <- c("IGO", "year", "one", "two", "three",
                         "four", "five", "six", "seven", "eight",
                         "nine", "ten")
  sub.rel[ ,1] <- orgs.rel
  sub.rel[ ,2] <- i
  for(j in 1:length(orgs.rel)){
    info <- sub[sub$IGO == orgs.rel[j], ]
    sub.rel[j, 3:12] <- colSums(info[ ,4:13]) / nrow(info)
  }
  topic.dist <- rbind(topic.dist, sub.rel, deparse.level=1)
  print(i)
}
topic.dist <- topic.dist[-1,]
write.csv(topic.dist, "IPR_stm_ten.csv")


## EM policy docs
EMdata <- read.csv("EMdocs.csv")

processed <- textProcessor(EMdata$documents, metadata=EMdata)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

## Fit the structural topic model
model.EM <- selectModel(out$documents, out$vocab, K=10, 
                        prevalence = ~ as.factor(IGO) + as.factor(Year), max.em.its = 300,
                        runs=20, data = out$meta, seed = 51111)

plotModels(model.EM)
ten <- model.EM$runout[[1]] 

labelTopics(ten)

topics.ten <- cbind(EMdata[ ,1:3], as.data.frame(ten$theta))
colnames(topics.ten)[4:13] <- 1:10

## aggregate topic distribution for each year
topic.dist <- as.data.frame(matrix(nrow=1, ncol=12))
colnames(topic.dist) <- c("IGO", "year", "one", "two", "three",
                          "four", "five", "six", "seven", "eight",
                          "nine", "ten")
for(i in 1988:2013){
  sub <- topics.ten[topics.ten$Year == i,]
  orgs.rel <- unique(sub$IGO)
  sub.rel <- as.data.frame(matrix(nrow = length(orgs.rel), ncol = 12))
  colnames(sub.rel) <- c("IGO", "year", "one", "two", "three",
                         "four", "five", "six", "seven", "eight",
                         "nine", "ten")
  sub.rel[ ,1] <- orgs.rel
  sub.rel[ ,2] <- i
  for(j in 1:length(orgs.rel)){
    info <- sub[sub$IGO == orgs.rel[j], ]
    sub.rel[j, 3:12] <- colSums(info[ ,4:13]) / nrow(info)
  }
  topic.dist <- rbind(topic.dist, sub.rel, deparse.level=1)
  print(i)
}
topic.dist <- topic.dist[-1,]
write.csv(topic.dist, "EM_stm_ten.csv")



## Division of labor among IOs is calculated as the absolute difference in
# the two organizations' distribution of topics in each year
# (see Deference_Replication.R for example of OSCE-SCO and OSCE-EU)



